531c86c028e98348e54ce5dc9c28dcfdf23a6d2d,src/core/org/terrier/indexing/TaggedDocument.java,TaggedDocument,getNextTerm,#,267
Before Change
}
if (tag_close) {
//System.err.println("processing close " + tagName);
if ((_tags.isTagToProcess(tagName) || _tags.isTagToSkip(tagName)) && !tagName.equals("")) {
processEndOfTag(upperCaseTagName);
String stackTop = null;
if (!stk.isEmpty()) {
stackTop = stk.peek();
if (_tags.isTagToProcess(stackTop)) {
inTagToProcess = true;
inTagToSkip = false;
} else {
inTagToProcess = false;
inTagToSkip = true;
continue;
}
} else {
inTagToProcess = false;
inTagToSkip = false;
}
}
if (_fields.isTagToProcess(tagName) && !tagName.equals("")) {
htmlStk.remove(upperCaseTagName);
}
}
After Change
//the string to return as a result at the end of this method.
String s = null;
//StringBuilder sw = null;
String tagName = null;
boolean endOfTagName;
//are we in a body of a tag?
boolean btag = true;
int ch = 0;
//loop while we are in a tag, and neither the end of file nor the end of document has been reached
while (btag && ch != -1 && !EOD) {
//initialise the stringbuffer with the maximum length of a term (heuristic)
//sw = new StringBuilder(tokenMaximumLength);
boolean tag_close = false;
boolean tag_open = false;
error = false;
try {
if (lastChar == '<' || lastChar == '&') {
ch = lastChar;
lastChar = -1;
}
//While not EOF and ch is not '<' and ch is not '&'
//and ch is a whitespace character
//CONSUME: whitespace
//while ((ch < 1 && ch != '<' && ch != '&') || Character.isWhitespace((char)ch))
while (ch != -1 && (( ch != '<' && ch != '&') && Character.isWhitespace((char)ch)))
{
ch = br.read();
counter++;
//if ch is '>' (end of tag), then there is an error.
if (ch == '>')
error = true;
}
//IDENTIFIES: start of opening or closing tags
if (ch == '<') {
ch = br.read();
counter++;
//if it is a closing tag, set tag_close to true
if (ch == '/') {
ch = br.read();
counter++;
tag_close = true;
} else if (ch == '!') { //else if it is a comment, that is <!
counter++;
ch = br.read();
if (ch == '[')
{
counter++;
//CDATA block, read until the next '[' or the end of file
while ((ch = br.read()) != '[' && ch != -1) {
counter++;
}
}
else
{ //it is a comment
//read until you encounter a '<', or a '>', or the end of file
while ((ch = br.read()) != '>' && ch != '<' && ch != -1) {
counter++;
}
counter++;
}
} else {
tag_open = true; //otherwise, it is an opening tag
}
}
if (ch == '&' ) {
//read until an opening or the end of a tag is encountered, or the
//end of file, or a space, or a semicolon,
//which means the end of the escape sequence &xxx;
while ((ch = br.read()) != '>' &&
ch != '<' &&
ch != ' ' &&
ch != ';' &&
ch != -1) {
counter++;
}
counter++;
}
//if the body of a tag is encountered
if ((btag = (tag_close || tag_open))) {
endOfTagName = false;
//read until the end of file, or the start, or the end
//of a tag, and save the name of the tag (appending stops at the first whitespace read)
while (ch != -1 && ch != '<' && ch != '>') {
if (! endOfTagName)
tagNameSB.append((char)ch);
ch = br.read();
counter++;
if (! endOfTagName && Character.isWhitespace((char)ch)) {
endOfTagName = true;
tagName = tagNameSB.toString();
upperCaseTagName = StringTools.toUpperCase(tagName);
//System.err.println("Found tag " + tagName + (tag_open ? " open" : " close") );
tagNameSB.setLength(0);
}
}
//ch = br.read();counter++;
if (! endOfTagName)
{
tagName = tagNameSB.toString();
upperCaseTagName = StringTools.toUpperCase(tagName);
//System.err.println("Found tag " + tagName+ (tag_open ? " open" : " close"));
tagNameSB.setLength(0);
}
} else { //otherwise, if we are not in the body of a tag
//read text to tokenise
if (((char)ch) == '>') {
counter++;
ch = br.read();
}
while (ch != -1 && ch != '<' && ch != '&')
{
sw.append((char)ch);
ch = br.read();
counter++;
}
// while (ch != -1
// && (//ch=='&' ||
// ((ch >= 'A') && (ch <= 'Z'))
// || ((ch >= 'a') && (ch <= 'z'))
// || ((ch >= '0') && (ch <= '9')))) {
// sw.append((char)ch);
// ch = br.read();
// counter++;
// }
}
lastChar = ch;
s = sw.toString();
sw.setLength(0);
if (tagName != null && !tagName.equals(""))
{
if (tag_open) {
//System.err.println("processing open " + tagName);
final boolean tagToProcess = _tags.isTagToProcess(tagName);
if (tagToProcess || _tags.isTagToSkip(tagName)) {
stk.push(upperCaseTagName);
if (tagToProcess) {
inTagToProcess = true;
inTagToSkip = false;
} else {
inTagToSkip = true;
inTagToProcess = false;
continue;
}
}
if (_fields.isTagToProcess(tagName) && !tagName.equals("")) {
htmlStk.add(upperCaseTagName);
inHtmlTagToProcess = true;
}
}
if (tag_close) {
//System.err.println("processing close " + tagName);
final boolean tagToProcess = _tags.isTagToProcess(tagName);
if (tagToProcess || _tags.isTagToSkip(tagName)) {
processEndOfTag(upperCaseTagName);
String stackTop = null;
if (!stk.isEmpty()) {
stackTop = stk.peek();
if (_tags.isTagToProcess(stackTop)) {
inTagToProcess = true;
inTagToSkip = false;
} else {
inTagToProcess = false;
inTagToSkip = true;
continue;
}
} else {
inTagToProcess = false;
inTagToSkip = false;
}
}
if (_fields.isTagToProcess(tagName)) {
htmlStk.remove(upperCaseTagName);
}
}